{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "loading train texts: 100%|██████████| 18846/18846 [00:03<00:00, 5846.48it/s]\n",
      "parsing texts: 100%|██████████| 18846/18846 [00:01<00:00, 14482.66it/s]\n",
      "2024-06-19 22:51:45,714 - TopMost - Real vocab size: 10000\n",
      "2024-06-19 22:51:45,810 - TopMost - Real training size: 18846 \t avg length: 68.727\n",
      "loading word embeddings: 100%|██████████| 10000/10000 [00:02<00:00, 4591.78it/s]\n",
      "2024-06-19 22:52:14,429 - TopMost - number of found embeddings: 9869/10000\n"
     ]
    }
   ],
   "source": [
    "import topmost\n",
    "import torch\n",
    "from topmost import RawDataset, Preprocess\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "\n",
    "# Raw 20 Newsgroups posts (subset='all'), with headers, footers and quoted replies removed.\n",
    "docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n",
    "preprocess = Preprocess(vocab_size=10000)\n",
    "\n",
    "# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the notebook\n",
    "# still runs on CPU-only machines instead of failing on a hard-coded 'cuda'.\n",
    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
    "dataset = RawDataset(docs, preprocess, device=device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-06-19 22:52:16,761 - FASTopic - use device: cuda\n",
      "loading train texts: 100%|██████████| 18846/18846 [00:01<00:00, 16028.10it/s]\n",
      "parsing texts: 100%|██████████| 18846/18846 [00:01<00:00, 17851.33it/s]\n",
      "2024-06-19 22:52:21,228 - TopMost - Real vocab size: 10000\n",
      "2024-06-19 22:52:21,324 - TopMost - Real training size: 18846 \t avg length: 68.727\n",
      "Training FASTopic:   0%|          | 0/200 [00:00<?, ?it/s]2024-06-19 22:52:26,581 - FASTopic - Epoch: 005 loss: 635.259\n",
      "Training FASTopic:   2%|▎         | 5/200 [00:00<00:04, 48.15it/s]2024-06-19 22:52:26,723 - FASTopic - Epoch: 010 loss: 631.099\n",
      "Training FASTopic:   5%|▌         | 10/200 [00:00<00:04, 39.53it/s]2024-06-19 22:52:26,865 - FASTopic - Epoch: 015 loss: 621.783\n",
      "Training FASTopic:  10%|▉         | 19/200 [00:00<00:05, 32.83it/s]2024-06-19 22:52:27,064 - FASTopic - Epoch: 020 loss: 612.178\n",
      "Training FASTopic:  12%|█▏        | 23/200 [00:00<00:05, 29.85it/s]2024-06-19 22:52:27,262 - FASTopic - Epoch: 025 loss: 605.041\n",
      "Training FASTopic:  14%|█▎        | 27/200 [00:00<00:06, 28.13it/s]2024-06-19 22:52:27,460 - FASTopic - Epoch: 030 loss: 598.067\n",
      "Training FASTopic:  16%|█▋        | 33/200 [00:01<00:06, 26.90it/s]2024-06-19 22:52:27,659 - FASTopic - Epoch: 035 loss: 591.410\n",
      "Training FASTopic:  20%|█▉        | 39/200 [00:01<00:06, 26.07it/s]2024-06-19 22:52:27,857 - FASTopic - Epoch: 040 loss: 586.154\n",
      "Training FASTopic:  21%|██        | 42/200 [00:01<00:06, 25.77it/s]2024-06-19 22:52:28,056 - FASTopic - Epoch: 045 loss: 582.193\n",
      "Training FASTopic:  24%|██▍       | 48/200 [00:01<00:05, 25.65it/s]2024-06-19 22:52:28,254 - FASTopic - Epoch: 050 loss: 578.970\n",
      "Training FASTopic:  27%|██▋       | 54/200 [00:01<00:05, 25.44it/s]2024-06-19 22:52:28,453 - FASTopic - Epoch: 055 loss: 576.228\n",
      "Training FASTopic:  28%|██▊       | 57/200 [00:02<00:05, 25.35it/s]2024-06-19 22:52:28,651 - FASTopic - Epoch: 060 loss: 573.838\n",
      "Training FASTopic:  32%|███▏      | 63/200 [00:02<00:05, 25.41it/s]2024-06-19 22:52:28,850 - FASTopic - Epoch: 065 loss: 571.716\n",
      "Training FASTopic:  34%|███▍      | 69/200 [00:02<00:05, 25.33it/s]2024-06-19 22:52:29,049 - FASTopic - Epoch: 070 loss: 569.789\n",
      "Training FASTopic:  36%|███▌      | 72/200 [00:02<00:05, 25.25it/s]2024-06-19 22:52:29,247 - FASTopic - Epoch: 075 loss: 568.035\n",
      "Training FASTopic:  39%|███▉      | 78/200 [00:02<00:04, 25.37it/s]2024-06-19 22:52:29,446 - FASTopic - Epoch: 080 loss: 566.417\n",
      "Training FASTopic:  42%|████▏     | 84/200 [00:03<00:04, 25.29it/s]2024-06-19 22:52:29,644 - FASTopic - Epoch: 085 loss: 564.911\n",
      "Training FASTopic:  44%|████▎     | 87/200 [00:03<00:04, 25.23it/s]2024-06-19 22:52:29,843 - FASTopic - Epoch: 090 loss: 563.489\n",
      "Training FASTopic:  46%|████▋     | 93/200 [00:03<00:04, 25.36it/s]2024-06-19 22:52:30,042 - FASTopic - Epoch: 095 loss: 562.139\n",
      "Training FASTopic:  50%|████▉     | 99/200 [00:03<00:03, 25.29it/s]2024-06-19 22:52:30,240 - FASTopic - Epoch: 100 loss: 560.842\n",
      "Training FASTopic:  51%|█████     | 102/200 [00:03<00:03, 25.24it/s]2024-06-19 22:52:30,438 - FASTopic - Epoch: 105 loss: 559.586\n",
      "Training FASTopic:  54%|█████▍    | 108/200 [00:04<00:03, 25.33it/s]2024-06-19 22:52:30,637 - FASTopic - Epoch: 110 loss: 558.376\n",
      "Training FASTopic:  57%|█████▋    | 114/200 [00:04<00:03, 25.31it/s]2024-06-19 22:52:30,836 - FASTopic - Epoch: 115 loss: 557.210\n",
      "Training FASTopic:  58%|█████▊    | 117/200 [00:04<00:03, 25.26it/s]2024-06-19 22:52:31,034 - FASTopic - Epoch: 120 loss: 556.090\n",
      "Training FASTopic:  62%|██████▏   | 123/200 [00:04<00:03, 25.24it/s]2024-06-19 22:52:31,233 - FASTopic - Epoch: 125 loss: 555.000\n",
      "Training FASTopic:  64%|██████▍   | 129/200 [00:04<00:02, 25.31it/s]2024-06-19 22:52:31,432 - FASTopic - Epoch: 130 loss: 553.980\n",
      "Training FASTopic:  66%|██████▌   | 132/200 [00:05<00:02, 25.24it/s]2024-06-19 22:52:31,630 - FASTopic - Epoch: 135 loss: 553.012\n",
      "Training FASTopic:  69%|██████▉   | 138/200 [00:05<00:02, 25.34it/s]2024-06-19 22:52:31,829 - FASTopic - Epoch: 140 loss: 552.069\n",
      "Training FASTopic:  72%|███████▏  | 144/200 [00:05<00:02, 25.31it/s]2024-06-19 22:52:32,027 - FASTopic - Epoch: 145 loss: 551.181\n",
      "Training FASTopic:  74%|███████▎  | 147/200 [00:05<00:02, 25.26it/s]2024-06-19 22:52:32,226 - FASTopic - Epoch: 150 loss: 550.317\n",
      "Training FASTopic:  76%|███████▋  | 153/200 [00:05<00:01, 25.37it/s]2024-06-19 22:52:32,424 - FASTopic - Epoch: 155 loss: 549.469\n",
      "Training FASTopic:  80%|███████▉  | 159/200 [00:06<00:01, 25.31it/s]2024-06-19 22:52:32,623 - FASTopic - Epoch: 160 loss: 548.640\n",
      "Training FASTopic:  81%|████████  | 162/200 [00:06<00:01, 25.24it/s]2024-06-19 22:52:32,822 - FASTopic - Epoch: 165 loss: 547.816\n",
      "Training FASTopic:  84%|████████▍ | 168/200 [00:06<00:01, 25.36it/s]2024-06-19 22:52:33,020 - FASTopic - Epoch: 170 loss: 546.999\n",
      "Training FASTopic:  87%|████████▋ | 174/200 [00:06<00:01, 25.33it/s]2024-06-19 22:52:33,219 - FASTopic - Epoch: 175 loss: 546.217\n",
      "Training FASTopic:  88%|████████▊ | 177/200 [00:06<00:00, 25.26it/s]2024-06-19 22:52:33,417 - FASTopic - Epoch: 180 loss: 545.468\n",
      "Training FASTopic:  92%|█████████▏| 183/200 [00:07<00:00, 25.36it/s]2024-06-19 22:52:33,616 - FASTopic - Epoch: 185 loss: 544.769\n",
      "Training FASTopic:  94%|█████████▍| 189/200 [00:07<00:00, 25.31it/s]2024-06-19 22:52:33,815 - FASTopic - Epoch: 190 loss: 544.093\n",
      "Training FASTopic:  96%|█████████▌| 192/200 [00:07<00:00, 25.25it/s]2024-06-19 22:52:34,013 - FASTopic - Epoch: 195 loss: 543.424\n",
      "Training FASTopic:  99%|█████████▉| 198/200 [00:07<00:00, 25.36it/s]2024-06-19 22:52:34,212 - FASTopic - Epoch: 200 loss: 542.770\n",
      "Training FASTopic: 100%|██████████| 200/200 [00:07<00:00, 25.86it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic 0: diamond gateway laserwriter speedstar lciii winmarks iivx cirrus booting procomm imagewriter printers iifx adb syquest\n",
      "Topic 1: armenian armenians turkish turkey armenia genocide turks azerbaijan russian war greek history children killed soldiers\n",
      "Topic 2: ride ford rear bike riding bikes bmw wheel gear honda brake tires engine motorcycle helmet\n",
      "Topic 3: deleted folks friend wait sorry btw friends weeks havent license hear recall ago story notice\n",
      "Topic 4: ink experiences toner removable premium defective dock cheapest supplier plugged attach backup considering certified chassis\n",
      "Topic 5: icon width bitmap draw colormap print colors drawing symbol atm popup undefined truetype icons compile\n",
      "Topic 6: window windows mouse dos fonts application microsoft motif manager widget lib server xterm screen usr\n",
      "Topic 7: refund discs envelope centaur sol postage insured eclipse prototype elegant camry scoop continental shops arrays\n",
      "Topic 8: braves pitcher cubs pitching ball sox pitchers hitter pitch batting runner phillies mets pitches jays\n",
      "Topic 9: medical health disease patients cancer treatment drug study food doctor hiv aids medicine vitamin pain\n",
      "Topic 10: muratoff bobbs bloodshed huts reparations dunn revisionism rawlinson hartill kars caucasian exterminated aspirations memoirs pogroms\n",
      "Topic 11: agree argument opinion sense reason point feel society isnt right makes youre evidence question mean\n",
      "Topic 12: circumference pilots ladies arlington northwest hawaii tours capitol jenks birds hazards instantly taped eds directors\n",
      "Topic 13: dog surrender banks intellect gordon shameful skepticism chastity shit taxes bikers fuck stupidity drunk cnn\n",
      "Topic 14: animation cica shareware pcx mpeg ftpcicaindianaedu bmp frames povray shading corel dxf pic autocad eps\n",
      "Topic 15: msg corn migraine fever seizures allergies induce cooking pill prescription sensation neurons restaurants substances medications\n",
      "Topic 16: roger suck watching umpire umpires mediocre waving beastmaster gant deserved stance tossed hated blew boring\n",
      "Topic 17: gay homosexuals morality homosexual moral sex sexual objective behavior theism humans morals theists fallacy arrogant\n",
      "Topic 18: duke wibbled mlud mcwilliams wears swimming hollow whove incentive feustel acquiring brass squid fortune mcguire\n",
      "Topic 19: voltage amp audio stereo channel circuit sony electronics signal noise manuals obo frequency sale analog\n",
      "Topic 20: greatly replies appreciate netters designing instruction risc hello summarize radius nextstep schematic designer ics ideally\n",
      "Topic 21: elvis squid wibbled mlud washed halfway mcguire newbie crushed duke holiday tender longest swimming hows\n",
      "Topic 22: battery plastic tube metal camera film batteries wax lens paint tubes heat cleaner bulb lenses\n",
      "Topic 23: adultery bloodshed predecessors aspirations muratoff bobbs huts reparations debates leviticus pogroms hartill unite revisionism dunn\n",
      "Topic 24: institutional stuart notices mailbox carnegie admin mellon duplicate repost thanx xerox originals kindly transparent watson\n",
      "Topic 25: espn ticket abc sports tickets gld keenan stadium franchise octopus coverage maine championship miami recsporthockey\n",
      "Topic 26: victoria mailbox adress latex watson wiley hal institutional carnegie mailer xerox keystrokes runtime forwarding extracting\n",
      "Topic 27: thanx philips snail receipt quest inquire envelopes netland asap australia cubic shadows dial bbss paperback\n",
      "Topic 28: scsi drives controller ram card ide floppy modem drive mhz apple bios bus disks disk\n",
      "Topic 29: unnecessarily offend urbana publically virtue pity ruin annoyed phrasing mate emotionally dismissed disgusted maddi metaphor\n",
      "Topic 30: pit det bos pts tor chi van pittsburgh cal toronto rangers chicago que detroit period\n",
      "Topic 31: kent motto sarcasm truelove irony mom lunatic sarcastic mindset precedent bastard pardon christmas adjective credibility\n",
      "Topic 32: max cars launch car air station vehicle radar lunar miles satellite radio fuel market shuttle\n",
      "Topic 33: key keys algorithm scheme des phones encrypted pgp wiretap random cellular classified rsa plaintext scicrypt\n",
      "Topic 34: president encryption clipper government federal public clinton law enforcement private congress states myers administration escrow\n",
      "Topic 35: file image graphics ftp pub files jpeg format images available software program color version gif\n",
      "Topic 36: mary heaven catholic scripture grace matthew unto thou holy pope doctrine resurrection luke mormon marriage\n",
      "Topic 37: energy nasa moon space solar science earth spacecraft surface mission theory orbit planet mars venus\n",
      "Topic 38: arlington circumference taped birds ladies rusty brooks pilots northwest arrangements seating orlando tours lined hawaii\n",
      "Topic 39: arab islam israel arabs muslims islamic jewish jews palestinian peace palestinians quran bosnia nazis hitler\n",
      "Topic 40: game players hit team games season league player teams hockey hes year average play night\n",
      "Topic 41: internet email mail interested address art cover mailing copies send list appears wolverine copy books\n",
      "Topic 42: cheers followups subtle assumes masses invent threads inaccurate presume constitutes trivial sensible replying corrupt hmm\n",
      "Topic 43: casserole tlu gic feustel maddi tpm hci deter irish imprisoned medieval oath remeber ali enforced\n",
      "Topic 44: sanity retract urbana jsh thier annoyed goodness obnoxious frightening publically mathew insisted gosh unnecessarily criticize\n",
      "Topic 45: gun guns batf weapons firearms koresh crime fbi police assault weapon amendment militia adl armed\n",
      "Topic 46: god jesus bible christ christian faith christians gods church christianity religion love father atheists truth\n",
      "Topic 47: feustel chronicle helicopters courage ali merchants dressed abruptly mcwilliams swimming depart singing battles currency sighted\n",
      "Topic 48: paperwork awhile circumference teh hip slowing rolls rat student eliminating instantly decreasing hazards progressive metric\n",
      "Topic 49: sentiments follower andi almighty wedding witnessing hail rites passover sing leonard boast eternally visions sacrifices\n"
     ]
    }
   ],
   "source": [
    "# Build a FASTopic trainer on the dataset prepared in the previous cell.\n",
    "# NOTE(review): verbose=True presumably turns on the training log shown in the output - confirm.\n",
    "trainer = topmost.FASTopicTrainer(\n",
    "    dataset,\n",
    "    verbose=True,\n",
    ")\n",
    "\n",
    "# train() returns two values, unpacked here as the top words and the document-topic distribution.\n",
    "top_words, doc_topic_dist = trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[37  6]\n"
     ]
    }
   ],
   "source": [
    "# Two documents that were not part of the training corpus.\n",
    "unseen_docs = [\n",
    "    \"This is a document about space, including words like space, satellite, launch, orbit.\",\n",
    "    \"This is a document about Microsoft Windows, including words like windows, files, dos.\",\n",
    "]\n",
    "\n",
    "# Run the trained model on the unseen documents.\n",
    "# NOTE(review): presumably one row per document and one column per topic - confirm against trainer.test.\n",
    "unseen_theta = trainer.test(unseen_docs)\n",
    "\n",
    "# For each row, print the column index that holds the largest value.\n",
    "strongest_topic = unseen_theta.argmax(axis=1)\n",
    "print(strongest_topic)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch1.13py3.8",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
